#include "preprocess.h"

// Constructs the preprocessor and immediately loads the raw objects of both
// sets: the two names are forwarded unchanged to readData, which opens them
// as input files.
Preprocess::Preprocess(string set1_name, string set2_name)
{
    this->readData(set1_name, set2_name);
}

// Next unused token id. mergeMaps assigns the current value to every
// (word, occurrence) pair it has not seen before and then increments it,
// so the ids handed out start at 1 and grow by one per new pair.
int token = 1;

// Converts the raw text objects of both sets into token objects.
//
// Steps:
//   1. every line of set1_source / set2_source is tokenized by addObjectTokens,
//      which appends the result to set1_dest / set2_dest;
//   2. for every entry of word_token_map, (token id, frequency from
//      word_freq_map) is inserted into Statistic::instance().token_freq_map;
//   3. the tokens inside each object are sorted, then the objects of each set
//      are sorted (both with the element type's operator<);
//   4. the two destination sets are swapped if set1_dest is the smaller one,
//      so set1_dest ends up at least as large as set2_dest.
//
// All loops are range-based: the previous "i <= size() - 1" form wrapped
// around for an empty container (size() is unsigned) and indexed out of
// bounds, e.g. when one of the input files contained no objects.
void Preprocess::transformObjects()
{
    for (const auto& line : set1_source)
        addObjectTokens(line, 1);

    for (const auto& line : set2_source)
        addObjectTokens(line, 2);

    // insert() keeps an already present entry, exactly as before.
    for (const auto& entry : word_token_map)
        Statistic::instance().token_freq_map.insert(pair<int,int>(entry.second, word_freq_map[entry.first]));

    for (auto& object : set1_dest)
        sort(object.data.begin(), object.data.end());

    for (auto& object : set2_dest)
        sort(object.data.begin(), object.data.end());

    sort(set1_dest.begin(), set1_dest.end());
    sort(set2_dest.begin(), set2_dest.end());

    // Keep the larger collection in set1_dest.
    if (set1_dest.size() < set2_dest.size())
        set1_dest.swap(set2_dest);
}

// Tokenizes one raw object line and appends the resulting Object to a set.
//
// str_object - text of one object. It is split with Tokenizer; the last
//              piece is parsed as the object's score, the pieces before it
//              are the words that get mapped to tokens.
// set_number - 1 appends to set1_dest; any other value appends to set2_dest.
//
// A line that yields no pieces at all is skipped (previously this read
// words_object[size() - 1] on an empty vector, which is out of bounds).
// stod throws std::invalid_argument / std::out_of_range when the last piece
// is not a representable number; that exception is not caught here.
void Preprocess::addObjectTokens(string str_object, int set_number)
{
    Tokenizer str;
    str.set(str_object);
    vector<string> words_object = str.split();

    // Nothing to score or tokenize: treat like the empty lines readData drops.
    if (words_object.empty())
        return;

    map<string, int> word_object_map;  // word -> number of occurrences in this object
    countWordFreq(words_object, word_object_map);

    Object tokens_object;
    mergeMaps(word_object_map, tokens_object);

    // The score is the last piece of the line.
    tokens_object.score = stod(words_object.back());

    if (set_number == 1)
        set1_dest.push_back(tokens_object);
    else
        set2_dest.push_back(tokens_object);
}

// Counts how often each word occurs in one object.
//
// words           - pieces of one object line; the LAST element is the score
//                   and is deliberately not counted.
// word_object_map - receives word -> occurrence count; existing entries are
//                   incremented, missing ones start at 1.
//
// Vectors with fewer than two elements contain no words, so nothing is
// counted. The previous bound "i <= words.size() - 2" wrapped around for
// sizes 0 and 1 (size() is unsigned) and indexed past the end of the vector.
void Preprocess::countWordFreq(vector<string> words, map<string,int>& word_object_map)
{
    // "i + 1 < size()" visits every element except the last without ever
    // subtracting from an unsigned size.
    for (size_t i = 0; i + 1 < words.size(); ++i)
        ++word_object_map[words[i]];  // operator[] value-initializes a new count to 0
}

// Folds the per-object word counts into the global maps and emits the
// object's tokens.
//
// For a word that occurs k times in the object, the pairs (word, 1) ...
// (word, k) are processed in order. A pair already present in word_freq_map
// has its frequency incremented and its existing id from word_token_map is
// appended to tokens_object.data. A pair seen for the first time is
// registered in both maps with frequency 1 and the current value of the
// global `token` counter, that id is appended, and the counter is advanced.
void Preprocess::mergeMaps(map<string, int>& word_object_map, Object& tokens_object)
{
    for (const auto& counted : word_object_map)
    {
        for (int occurrence = 1; occurrence <= counted.second; ++occurrence)
        {
            const pair<string,int> key(counted.first, occurrence);

            auto known = word_freq_map.find(key);
            if (known != word_freq_map.end())
            {
                // Pair already registered: bump its frequency, reuse its id.
                ++known->second;
                tokens_object.data.push_back(Token(word_token_map[key]));
                continue;
            }

            // First sighting of this pair: register it under a fresh id.
            word_freq_map.insert(pair<pair<string,int>, int>(key, 1));
            word_token_map.insert(pair<pair<string,int>, int>(key, token));
            tokens_object.data.push_back(Token(token));
            ++token;
        }
    }
}

// Loads the raw objects of both sets, one object per non-empty line.
//
// set1_name - path of the file whose lines are appended to set1_source.
// set2_name - path of the file whose lines are appended to set2_source.
//
// Reading is driven by the result of getline itself. The previous
// "while (!file.eof())" loop never terminated when a file could not be
// opened or a read error occurred, because eofbit is never set in those
// cases; for the second file it could also keep pushing a stale line.
// A file that cannot be opened now simply contributes no objects.
void Preprocess::readData(string set1_name, string set2_name)
{
    ifstream set1_file(set1_name);
    ifstream set2_file(set2_name);

    string object;

    // getline returns the stream, which tests false on EOF-with-no-data,
    // open failure and read errors alike. A final line without a trailing
    // newline is still delivered.
    while (getline(set1_file, object))
    {
        if (!object.empty())
            set1_source.push_back(object);
    }

    while (getline(set2_file, object))
    {
        if (!object.empty())
            set2_source.push_back(object);
    }
}
